Loading packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(leaflet)
library(sf)
## Warning: package 'sf' was built under R version 4.1.2
## Linking to GEOS 3.9.1, GDAL 3.4.0, PROJ 8.1.1; sf_use_s2() is TRUE
library(raster)
## Warning: package 'raster' was built under R version 4.1.2
## Loading required package: sp
## 
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
## 
##     select
library(sp)

Loading 2019 data

file_name <- "../data/bluebikes_tripdata_2019.csv"
# read:
data_2019 <- read_csv(file_name, progress = FALSE)
## Rows: 2522771 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (3): start station name, end station name, usertype
## dbl  (12): tripduration, start station id, start station latitude, start sta...
## dttm  (2): starttime, stoptime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_2019 <- data_2019 %>% rename("start_long" = "start station longitude",
                                      "start_lat" = "start station latitude",
                                      "end_long" = "end station longitude",
                                      "end_lat" = "end station latitude",
                                      "start_station_id" = "start station id",
                                      "start_station_name" = "start station name",
                                      "end_station_name" = "end station name",
                                      "birth_year" = "birth year")

Basic data explorations

Exploring basic patterns in the data that might be interesting for further exploration

# subscribers vs. customers (single use)
data_2019 %>% filter(usertype == "Subscriber") %>% count() # returns 1988494
## # A tibble: 1 × 1
##         n
##     <int>
## 1 1988494
data_2019 %>% filter(usertype == "Customer") %>% count() # returns 534277
## # A tibble: 1 × 1
##        n
##    <int>
## 1 534277
# genders 
data_2019 %>% filter(gender == 0) %>% count() # 277703 (female?)
## # A tibble: 1 × 1
##        n
##    <int>
## 1 277703
data_2019 %>% filter(gender == 1) %>% count() # 1652699 (male?)
## # A tibble: 1 × 1
##         n
##     <int>
## 1 1652699
data_2019 %>% filter(gender == 2) %>% count() # 592369 (other?)
## # A tibble: 1 × 1
##        n
##    <int>
## 1 592369
# age range
data_2019 %>% dplyr::select(birth_year) %>% range() # return 1886 2003
## [1] 1886 2003
data_2019 %>% filter(birth_year == 1886) %>% count() # returns 3
## # A tibble: 1 × 1
##       n
##   <int>
## 1     3
# trip duration range
data_2019 %>% dplyr::select(tripduration) %>% range() # 61 42567137
## [1]       61 42567137

There are many more subscribers which suggests that it’s mostly locals and long-term visitors using the bikes.

It seems that there’s something off about the gender data, so I will not include that.

There is also something weird about the age data - there are three entries of 1888 and just in general quite old people so again, I will not use this information either.

There is also a massive range in trip duration but that’s not necessarily an error

On the basis of this exploration I select the columns I want to use to reduce the strain on my computer when working with such large files

Cleaning the data prior to analysis

cleaned_2019 <- data_2019 %>% dplyr::select(c(tripduration, starttime, stoptime, start_lat, start_long, start_station_name, end_lat, end_long, end_station_name, usertype, year, month))

Selecting the unique start stations in 2019 and removing outliers

unique_start <- cleaned_2019 %>% dplyr::select(c(start_long, start_lat, start_station_name)) %>% unique()
# there was a single outlier, so I remove it
unique_start <- unique_start[!(unique_start$start_long == 0.0000|unique_start$start_lat == 0.0000), ]

# other outliers
unique_start <- unique_start %>% filter(start_station_name != "BCBS Hingham")
unique_start <- unique_start %>% filter(start_station_name != "Main St at Beacon St")

A quick map

leaflet() %>% addTiles() %>% setView(lng = -71.067083, lat = 42.341145, zoom = 11.5) %>% addCircleMarkers(lng = unique_start$start_long, lat = unique_start$start_lat) %>% addScaleBar()

Doing the same for the end stations

unique_end <- cleaned_2019 %>% dplyr::select(c(end_long, end_lat, end_station_name)) %>% unique()
# I do the same for the end stations
unique_end <- unique_end[!(unique_end$end_long == 0.0000|unique_end$end_lat == 0.000), ]
# another outlier
unique_end <- unique_end %>% filter(end_station_name != "BCBS Hingham")

Spatial analysis

Figuring out where people are going

load Boston city boundary and neighborhoods

# neighborhoods
neighborhoods <- st_read("../data/Boston_Neighborhoods/Boston_Neighborhoods.shp")
## Reading layer `Boston_Neighborhoods' from data source 
##   `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Boston_Neighborhoods/Boston_Neighborhoods.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 26 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 739715.3 ymin: 2908294 xmax: 812255.1 ymax: 2970086
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
# reproject to get it on the map
projected_neighborhoods <- st_transform(neighborhoods, 4326)

Plot the neighborhoods and stations together start stations and neighborhoods

p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name) 

p2_popup <- paste0("<strong>Start station: </strong>", unique_start$start_station_name)

leaflet(projected_neighborhoods) %>% 
  addPolygons(
    stroke = TRUE, 
    fillColor = "transparent", color = "black", weight = 3,
    popup = p_popup) %>% # add popup 
  addTiles() %>%
  setView(lng = -71.067083, lat = 42.331145, zoom = 11.5) %>% 
  addCircleMarkers(lng = unique_start$start_long,
                   lat = unique_start$start_lat,
                   popup = p2_popup) %>% 
   addScaleBar()

From this map, we can get an idea of where in Boston are moving, both within the larger city boundary and within neighbors of the city.

To get the intersections of the points and the neighborhoods, I create a convex hull around the points. First I transform the longitude and latitude coordinates into a sf object.

# remove NAs if there is any
points <- unique_start %>% 
  filter(! is.na(start_long)) %>% 
  filter(! is.na(start_lat))

# transform points to sf object
points_sf <- st_as_sf(points, coords = c("start_long", "start_lat"), crs = 4326)
points_sf
## Simple feature collection with 406 features and 1 field
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -71.16649 ymin: 42.2679 xmax: -71.0061 ymax: 42.4148
## Geodetic CRS:  WGS 84
## # A tibble: 406 × 2
##    start_station_name                                           geometry
##  * <chr>                                                     <POINT [°]>
##  1 Dartmouth St at Newbury St                       (-71.07783 42.35096)
##  2 MIT Stata Center at Vassar St / Main St          (-71.09116 42.36213)
##  3 Inman Square at Springfield St.                  (-71.10016 42.37438)
##  4 Third at Binney                                  (-71.08277 42.36544)
##  5 Verizon Innovation Hub 10 Ware Street            (-71.11305 42.37251)
##  6 University Park                                  (-71.10006 42.36265)
##  7 One Kendall Square at Hampshire St / Portland St (-71.09169 42.36628)
##  8 359 Broadway - Broadway at Fayette Street         (-71.10441 42.3708)
##  9 Prudential Center - 101 Huntington Ave           (-71.08066 42.34652)
## 10 Encore                                           (-71.07245 42.39329)
## # … with 396 more rows
# transform its crs 
points_3035 <- st_transform(points_sf, crs = 3035)
points_3035
## Simple feature collection with 406 features and 1 field
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -902380.1 ymin: 5513497 xmax: -885856.2 ymax: 5529263
## Projected CRS: ETRS89-extended / LAEA Europe
## # A tibble: 406 × 2
##    start_station_name                                          geometry
##  * <chr>                                                    <POINT [m]>
##  1 Dartmouth St at Newbury St                         (-892912 5521013)
##  2 MIT Stata Center at Vassar St / Main St          (-892186.8 5522717)
##  3 Inman Square at Springfield St.                  (-891238.4 5524152)
##  4 Third at Binney                                  (-891637.1 5522270)
##  5 Verizon Innovation Hub 10 Ware Street            (-891770.1 5525035)
##  6 University Park                                  (-892377.8 5523436)
##  7 One Kendall Square at Hampshire St / Portland St (-891797.9 5523009)
##  8 359 Broadway - Broadway at Fayette Street        (-891702.1 5524264)
##  9 Prudential Center - 101 Huntington Ave           (-893420.9 5520963)
## 10 Encore                                           (-888647.6 5523156)
## # … with 396 more rows
# create convex hull
points_ch <- st_convex_hull(st_union(points_3035))
points_ch
## Geometry set for 1 feature 
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -902380.1 ymin: 5513497 xmax: -885856.2 ymax: 5529263
## Projected CRS: ETRS89-extended / LAEA Europe
## POLYGON ((-899151.8 5513497, -901419.5 5517212,...
plot(points_ch)

class(points_ch)
## [1] "sfc_POLYGON" "sfc"

I have now created a polygon out of the start station points.

Let’s get the intersection between this polygon and the neighborhoods multipolygon

# make sure they have the same crs
neighborhoods_3035 <- st_transform(neighborhoods, crs = 3035)

# get the intersections
points_neighborhoods_intersection <- st_intersection(points_ch, st_geometry(neighborhoods_3035))

# create buffer feature of the neighborhoods
p_n_buff <- st_union(points_neighborhoods_intersection)

Plot the objects together

plot(points_ch)
plot(points_neighborhoods_intersection, border='red', lwd=2, add = TRUE)
plot(p_n_buff, add = TRUE)

The plot shows that the neighborhoods occupy a significant amount of space within the points polygon. However, there is also a large area that the neighborhoods do not cover.

I do the same for the end stations to see if we find the same patterns.

# remove NAs if there is any
points_end <- unique_end %>% 
  filter(! is.na(end_long)) %>% 
  filter(! is.na(end_lat))
# transform the points into an sf object 
points_end_sf <- st_as_sf(points_end, coords = c("end_long", "end_lat"), crs = 4326)
# transform their crs
points_end_3035 <- st_transform(points_end_sf, crs = 3035)
# create convex hull
points_end_ch <- st_convex_hull(st_union(points_end_3035))

# plot it
plot(points_end_ch)

class(points_end_ch)
## [1] "sfc_POLYGON" "sfc"

The polygon looks very much identical to the start stations polyogon

Let’s get the intersection

# get the intersections
points_neighborhoods_intersection_end <- st_intersection(points_end_ch, st_geometry(neighborhoods_3035))

# create buffer feature of the neighborhoods
p_n_buff_end <- st_union(points_neighborhoods_intersection_end)

Plot the objects together

plot(points_end_ch)
plot(points_neighborhoods_intersection_end, border='red', lwd=2, add = TRUE)
plot(p_n_buff_end, add = TRUE)

It looks very much the same for the end stations.

Once again, the map confirms what we saw in the plot. There is an area beyond the city boundary towards the north where there are a lot of points.

Loading the 2020 data

Let’s look at the 2020 data

file_name <- "../data/bluebikes_tripdata_2020.csv"
# read:
data_2020 <- read_csv(file_name, progress = FALSE)
## Rows: 1999446 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (4): start station name, end station name, usertype, postal code
## dbl  (12): tripduration, start station id, start station latitude, start sta...
## dttm  (2): starttime, stoptime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_2020 <- data_2020 %>% rename("start_long" = "start station longitude",
                                      "start_lat" = "start station latitude",
                                      "end_long" = "end station longitude",
                                      "end_lat" = "end station latitude",
                                      "start_station_id" = "start station id",
                                      "start_station_name" = "start station name",
                                      "end_station_name" = "end station name",
                                      "birth_year" = "birth year")

Clearning the data prior to analysis

Let’s clean the 2020 data

cleaned_2020 <- data_2020 %>% dplyr::select(c(tripduration, starttime, stoptime, start_lat, start_long, start_station_name, end_lat, end_long, end_station_name, usertype, year, month))

cleaned_2020
## # A tibble: 1,999,446 × 12
##    tripduration starttime           stoptime            start_lat start_long
##           <dbl> <dttm>              <dttm>                  <dbl>      <dbl>
##  1         1793 2020-11-01 00:00:18 2020-11-01 00:30:12      42.3      -71.0
##  2         1832 2020-11-01 00:00:34 2020-11-01 00:31:07      42.3      -71.0
##  3          262 2020-11-01 00:01:54 2020-11-01 00:06:17      42.3      -71.0
##  4          419 2020-11-01 00:04:00 2020-11-01 00:10:59      42.4      -71.1
##  5          275 2020-11-01 00:04:01 2020-11-01 00:08:36      42.4      -71.1
##  6          684 2020-11-01 00:04:39 2020-11-01 00:16:03      42.3      -71.1
##  7          608 2020-11-01 00:04:43 2020-11-01 00:14:52      42.4      -71.1
##  8          438 2020-11-01 00:05:57 2020-11-01 00:13:15      42.4      -71.1
##  9          541 2020-11-01 00:07:15 2020-11-01 00:16:16      42.4      -71.1
## 10         1138 2020-11-01 00:07:17 2020-11-01 00:26:16      42.4      -71.1
## # … with 1,999,436 more rows, and 7 more variables: start_station_name <chr>,
## #   end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## #   year <dbl>, month <dbl>

Spatial Analysis

Figuring out where people are going and comparison with the 2019 data

I’ll only look at the start stations

unique_start_2020 <- cleaned_2020 %>% dplyr::select(c(start_long, start_lat, start_station_name)) %>% unique()
# there was a single outlier, so I remove it
unique_start_2020 <- unique_start_2020[!(unique_start_2020$start_long == 0.0000|unique_start_2020$start_lat == 0.0000), ]

# another outlier
unique_start_2020 <- unique_start_2020 %>% filter(start_station_name != "BCBS Hingham")

unique_end_2020 <- cleaned_2020 %>% dplyr::select(c(end_long, end_lat, end_station_name)) %>% unique()

unique_end_2020 <- unique_end_2020[!(unique_end_2020$end_long == 0.0000|unique_end_2020$end_lat == 0.0000), ]

unique_end_2020 <- unique_end_2020 %>% filter(end_station_name != "BCBS Hingham")

Let’s plot the start stations with the neighborhoods

# plot the 2020 start stations
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name) 

p5_popup <- paste0("<strong>Start station: </strong>", unique_start_2020$start_station_name)

leaflet(projected_neighborhoods) %>% 
  addPolygons(
    stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup) %>% # add popup 
  addTiles() %>%
  setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>% 
  addCircleMarkers(lng = unique_start_2020$start_long,
                   lat = unique_start_2020$start_lat,
                   popup = p5_popup)

let’s look at the 2019 data for comparison

p2_popup <- paste0("<strong>Start station: </strong>", unique_start$start_station_name)

leaflet(projected_neighborhoods) %>% 
  addPolygons(
    stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup) %>% # add popup 
  addTiles() %>%
  setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>% 
  addCircleMarkers(lng = unique_start$start_long,
                   lat = unique_start$start_lat,
                   popup = p2_popup)

We can see from the maps that there are areas/stations that aren’t in the 2019 data which are also outside of the city boundary. These are stations towards the west of the city boundary, beyond Brighton.

Let’s look at the intersection between the points and neighborhoods in 2020

# remove NAs if there is any
points_2020 <- unique_start_2020 %>% 
  filter(! is.na(start_long)) %>% 
  filter(! is.na(start_lat))
# transform the points into an sf object 
points_2020_sf <- st_as_sf(points_2020, coords = c("start_long", "start_lat"), crs = 4326)
# transform their crs
points_2020_3035 <- st_transform(points_2020_sf, crs = 3035)
# create convex hull
points_2020_ch <- st_convex_hull(st_union(points_2020_3035))

# plot it
plot(points_2020_ch)

class(points_end_ch)
## [1] "sfc_POLYGON" "sfc"
# get the intersections
points_neighborhoods_intersection_2020 <- st_intersection(points_2020_ch, st_geometry(neighborhoods_3035))

# create buffer feature of the neighborhoods
p_n_buff_2020 <- st_union(points_neighborhoods_intersection_2020)

Plot the objects together

plot(points_2020_ch)
plot(points_neighborhoods_intersection_2020, border='red', lwd=2, add = TRUE)
plot(p_n_buff_2020, add = TRUE)

compare this the 2019 start stations

plot(points_end_ch)
plot(points_neighborhoods_intersection_end, border='red', lwd=2, add = TRUE)
plot(p_n_buff_end, add = TRUE)

The two intersections look only slightly different.

Let’s find the most used start station in 2019 and 2020

# got the function from here: https://stackoverflow.com/questions/66787325/how-to-find-the-least-frequent-value-in-a-column-of-dataframe-in-r

Modemin <- function(x){
  a = table(x) # x is a column
  return(a[which.min(a)])
}
# least used start station in 2020: MTL-ECO4-01 (1 time)
Modemin(data_2020$start_station_name)
## MTL-ECO4-01 
##           1
# least used start station in 2019: 8D QC Station 01 (1 time)
Modemin(data_2019$start_station_name)
## 8D QC Station 01 
##                1
# lest used end station in 2020: Mobile Temporary Station 1 (1 time)
Modemin(data_2020$end_station_name)
## Mobile Temporary Station 1 
##                          1
# lest used end station in 2019: 8D QC Station 02 (1 time)
Modemin(data_2019$end_station_name)
## 8D QC Station 02 
##                1
Modemax <- function(x){
  a = table(x) # x is a column
  return(a[which.max(a)])
}
# most used start station in 2019: MIT at Mass Ave / Amherst St (61056 times)
Modemax(data_2019$start_station_name)
## MIT at Mass Ave / Amherst St 
##                        61056
# most used start station in 2020: Central Square at Mass Ave / Essex St (32668 times )
Modemax(data_2020$start_station_name)
## Central Square at Mass Ave / Essex St 
##                                 32668
# most used end station in 2019: MIT at Mass Ave / Amherst St (56986 times)
Modemax(data_2019$end_station_name)
## MIT at Mass Ave / Amherst St 
##                        56986
# most used end station in 2020: Central Square at Mass Ave / Essex St (33493 times)
Modemax(data_2020$end_station_name)
## Central Square at Mass Ave / Essex St 
##                                 33493

So the most used stations in 2020 are used significantly less than the most used station in 2019. Interestingly, the most used start and end stations in 2019 is the same and the most used start and end station in 2020 is also the same.

Let’s find out where these stations are and plot them with leaflet

# most used station in 2019
data_2019 %>% filter(start_station_name == "MIT at Mass Ave / Amherst St") %>% dplyr::select(c(start_long, start_lat))
## # A tibble: 61,056 × 2
##    start_long start_lat
##         <dbl>     <dbl>
##  1      -71.1      42.4
##  2      -71.1      42.4
##  3      -71.1      42.4
##  4      -71.1      42.4
##  5      -71.1      42.4
##  6      -71.1      42.4
##  7      -71.1      42.4
##  8      -71.1      42.4
##  9      -71.1      42.4
## 10      -71.1      42.4
## # … with 61,046 more rows
leaflet(projected_neighborhoods) %>% addTiles() %>% setView(lng = -71.1, lat = 42.4, zoom = 12) %>% addCircleMarkers(lng = -71.1, lat = 42.4) %>% 
  addPolygons(stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup) %>% addScaleBar()
# most used station in 2020
data_2020 %>% filter(start_station_name == "Central Square at Mass Ave / Essex St") %>% dplyr::select(c(start_long, start_lat))
## # A tibble: 32,668 × 2
##    start_long start_lat
##         <dbl>     <dbl>
##  1      -71.1      42.4
##  2      -71.1      42.4
##  3      -71.1      42.4
##  4      -71.1      42.4
##  5      -71.1      42.4
##  6      -71.1      42.4
##  7      -71.1      42.4
##  8      -71.1      42.4
##  9      -71.1      42.4
## 10      -71.1      42.4
## # … with 32,658 more rows

As it turn out the two stations have the same longitude and latitude coordinates. Interestingly, the most used station is far outside the city boundary. From the station name in the 2019 data, it seems that this might be a station close to MIT, so let’s find MIT’s coordinates and plot it along side the point.

p_popup = paste0("<strong>Start station: </strong>", "MIT")
p_popup2 = paste0("<strong>institution: </strong>", "Central Square at Mass Ave / Essex St")

leaflet(projected_neighborhoods) %>% addTiles() %>% setView(lng = -71.1, lat = 42.4, zoom = 12) %>% addCircleMarkers(lng = -71.1, lat = 42.4, popup = p_popup2) %>% 
  addCircleMarkers(lng = -71.092003, lat = 42.360001, popup = p_popup, color = "red") %>% 
  addPolygons(stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5)

So the central campus of MIT is not near the most used stations in 2019 and 2020, but plotting the different higher learning institutions located in Boston might yield interesting results seeing as many of the stations are clustered around this area.

Load colleges and universities

uni <- st_read("../data/Colleges_and_Universities/Colleges_and_Universities.shp")
## Reading layer `Colleges_and_Universities' from data source 
##   `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Colleges_and_Universities/Colleges_and_Universities.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 60 features and 28 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 742669.8 ymin: 2917826 xmax: 780798.5 ymax: 2964159
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
# reproject to get it on the map
projected_uni <- st_transform(uni, 4326)

projected_uni <- projected_uni[!(projected_uni$Longitude == 0.0000|projected_uni$Latitude == 0.0000), ]

Plot the colleges and universities on a map together with the start stations from 2019 and 2020

p7_popup <- paste0("<strong>Institution: </strong>", projected_uni$Name)

leaflet() %>% addTiles() %>% 
  addCircleMarkers(lng = projected_uni$Longitude, lat = projected_uni$Latitude, color = "red", popup = p7_popup) %>% 
  addCircleMarkers(lng = unique_start$start_long, lat = unique_start$start_lat) %>% setView(lng = -71.1, lat = 42.35, zoom = 11.5) %>% addScaleBar()

It does in fact seem that many of the start station points are centered around the location of the colleges and universities.

Let’s make the colleges and universities into a polygon and plot it again

# remove NAs if there is any
points_uni <- projected_uni %>% 
  filter(! is.na(Longitude)) %>% 
  filter(! is.na(Latitude))
# transform points into sf object
points_uni_sf <- st_as_sf(points_uni, coords = c("Longitude", "Latitude"), crs = 4326)
# transform its crs 
points_uni_3035 <- st_transform(points_uni_sf, crs = 3035)
# create convex hull
points_uni_ch <- st_convex_hull(st_union(points_uni_3035))
# plot it
plot(points_uni_ch)

# give it projected crs again
points_uni_ch_3035 <- st_transform(points_uni_ch, crs = 4326)

Let’s plot the colleges and universities polygon together the with 2019 start stations

leaflet(points_uni_ch_3035) %>% addTiles() %>% 
  addPolygons(color = "yellow") %>% 
  addCircleMarkers(lng = unique_start$start_long, lat = unique_start$start_lat) %>% setView(lng = -71.1, lat = 42.35, zoom = 11.5)

So indeed, I does seem that the colleges and institutions take up a fairly large part of the space the points occupy. Let’s confirm this be getting the intersection between the points-polygon and the colleges and universities-polygon

# get the intersections
uni_points_intersection <- st_intersection(points_ch, points_uni_ch)

# create buffer feature of the neighborhoods
p_n_buff_uni <- st_union(uni_points_intersection)

Plot the objects together

plot(points_ch)
plot(uni_points_intersection, add = TRUE)
plot(points_uni_ch, border = "red", add = TRUE)

Compare with the neighborhoods and points intersection

plot(points_ch)
plot(points_neighborhoods_intersection, border='red', lwd=2, add = TRUE)
plot(p_n_buff, add = TRUE)

Indeed, the intersection of the colleges and universities occupy around the same amount of space as the neighborhoods. However, they also occupy the same space within the points-polygon, and I therefore find the intersection between the neighborhoods and the colleges and universities

neighborhoods_uni_intersection <- st_intersection(points_uni_ch, neighborhoods_3035)

plot(points_uni_ch)
plot(neighborhoods_uni_intersection, border="red", lwd=2, add = TRUE)
plot(p_n_buff, add = TRUE)

So, the neighborhoods and the learning institutions occupy much of the same space. Therefore, it is not possible to reliably confirm that the the users of the bikes are primarily seeking out the learning institutions. However, the overlay of the institutions on the points, did capture a real pattern of use just like the neighborhoods did.

Let’s look at the map with the most used start/end station again

p_popup = paste0("<strong>Start station: </strong>", "MIT")
p_popup2 = paste0("<strong>institution: </strong>", "Central Square at Mass Ave / Essex St")

# with slight changes to the view setting
leaflet(projected_neighborhoods) %>% addTiles() %>% setView(lng = -71.1, lat = 42.4, zoom = 15.5) %>% addCircleMarkers(lng = -71.1, lat = 42.4, popup = p_popup2) %>% 
  addPolygons(stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5)

When zooming in, it seems that the most used station is located close to several schools, and it might therefore yield interesting results to find the location of the schools located within Boston

Here I overlay data from public and non-public schools

public <- st_read("../data/Public_Schools/Public_Schools.shp")
## Reading layer `Public_Schools' from data source 
##   `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Public_Schools/Public_Schools.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 131 features and 16 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 744291.9 ymin: 2910470 xmax: 790128.2 ymax: 2968120
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
public <- st_transform(public, 4326)
public <- st_as_sf(public, crs = 4326)
public <- st_convex_hull(public$geometry)
# extract coordinates
public_coordinates <- st_coordinates(public)
class(public_coordinates)
## [1] "matrix" "array"
# transform to data frame
public_df <- public_coordinates %>% 
  as_tibble()


# I do the same for the non-public schools
non_public <- st_read("../data/Non_Public_Schools/Non_Public_Schools.shp")
## Reading layer `Non_Public_Schools' from data source 
##   `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Non_Public_Schools/Non_Public_Schools.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 82 features and 14 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 746614.7 ymin: 2912741 xmax: 791398.7 ymax: 2965487
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
non_public <- st_transform(non_public, 4326)
non_public <- st_as_sf(non_public, crs = 4326)
non_public <- st_convex_hull(non_public$geometry)
# extract coordinates
non_public_coordinates <- st_coordinates(non_public)
class(non_public_coordinates)
## [1] "matrix" "array"
# transform to data frame
non_public_df <- non_public_coordinates %>% 
  as_tibble()

non_public_df <- non_public_coordinates %>% as_tibble()

Now we can plot the schools at points on the map alongside the start stations

p_popup <- paste0("Public")
p_popup2 <- paste0("Non-public")

leaflet() %>% addTiles() %>% 
  addCircleMarkers(lng = unique_start$start_long, lat = unique_start$start_lat) %>% 
  addCircleMarkers(lng = public_df$X, lat = public_df$Y, color = "red", popup = p_popup) %>% 
  addCircleMarkers(lng = non_public_df$X, lat = non_public_df$Y, color = "yellow", popup = p_popup2) %>% setView(lng = -71.1, lat = 42.33, zoom = 11.5) %>% addScaleBar()

This plot seems to reflect a usage pattern dominat in the lower part of the city. There are many overlaps of the points. Let’s transform the schools into a polygon and plot again. I do this by finding the collected shared space between the points

non_public_sf <-  st_as_sf(non_public_df, coords = c("X", "Y"), crs = 4326)
class(non_public_sf)
## [1] "sf"         "tbl_df"     "tbl"        "data.frame"
non_public_sf
## Simple feature collection with 82 features and 0 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -71.16578 ymin: 42.24013 xmax: -70.99945 ymax: 42.38436
## Geodetic CRS:  WGS 84
## # A tibble: 82 × 1
##                geometry
##  *          <POINT [°]>
##  1 (-71.10514 42.32767)
##  2 (-71.05872 42.30139)
##  3 (-71.13015 42.30658)
##  4 (-71.09574 42.34934)
##  5 (-71.13213 42.24426)
##  6 (-71.06087 42.32163)
##  7 (-71.10991 42.26222)
##  8 (-71.12682 42.25037)
##  9 (-71.07193 42.28944)
## 10   (-71.122 42.28169)
## # … with 72 more rows
non_public_sf <- st_transform(non_public_sf, 3035)
non_public_sf
## Simple feature collection with 82 features and 0 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -905296.7 ymin: 5514781 xmax: -887533.8 ymax: 5526907
## Projected CRS: ETRS89-extended / LAEA Europe
## # A tibble: 82 × 1
##               geometry
##  *         <POINT [m]>
##  1   (-895919 5521715)
##  2 (-897213.5 5516538)
##  3 (-898647.9 5522373)
##  4 (-893555.8 5522298)
##  5 (-904763.7 5518759)
##  6 (-895304.3 5517929)
##  7 (-902414.1 5518126)
##  8 (-904025.8 5518718)
##  9 (-898735.2 5516836)
## 10 (-900848.8 5520239)
## # … with 72 more rows
non_public_ch <- st_convex_hull(st_union(non_public_sf))
plot(non_public_ch)

plot(non_public_ch)

public_sf <- non_public_coordinates %>% 
  as_tibble() %>% 
  sf::st_as_sf(coords=c(1,2), crs = 4326)
class(public_sf)
## [1] "sf"         "tbl_df"     "tbl"        "data.frame"
public_sf <- st_transform(public_sf, 3035)

public_ch <- st_convex_hull(st_union(public_sf))
plot(public_ch)

schools <- st_convex_hull(st_union(non_public_ch, public_ch))
plot(schools)

Plot the schools with the stations points

plot(points_ch)
plot(schools, border = "yellow", lwd= 2, add = TRUE)

So the schools also take a around the same space that the neighborhoods did and the colleges and universities did

From these analyzes, it was not possible to confirm what was located to the north of the city where a lot of the points were clustered. However, both the institutions overlay and the schools overlay revealed genuine patterns of use in different parts of the neighborhoods within Boston.